In [1]:
import pandas as pd
# Relative location of the compressed CSV input (a Linux blame log, judging by
# the file name -- confirm against the data source).
path = "../../../software-data/projects/linux/linux_blame_log.csv.gz"
# read_csv's default compression='infer' picks the codec from the ".gz" suffix.
log = pd.read_csv(path)
# Preview the first rows as the cell's rich output.
log.head()
Out[1]:
In [2]:
# Print a summary of the frame: columns, non-null counts, dtypes, memory usage.
log.info()
In [3]:
# Number of rows per author, most frequent first (value_counts sorts descending).
author_counts = log['author'].value_counts()
# Keep only the ten most frequent authors.
top10 = author_counts.head(10)
top10
Out[3]:
In [4]:
%matplotlib inline
top10.plot.pie();
In [5]:
# Parse the 'timestamp' column into pandas datetimes, then store it back
# under the same column name.
parsed_timestamps = pd.to_datetime(log['timestamp'])
log['timestamp'] = parsed_timestamps
log.head()
Out[5]:
In [6]:
# Reference point for the age calculation: the moment this cell runs, so the
# resulting values change from run to run.
today = pd.Timestamp("today")
# Age of every row = elapsed time between its timestamp and the reference point.
log['age'] = today - log['timestamp']
log.head()
Out[6]:
In [7]:
# Break every path into its "/"-separated segments.
path_segments = log['path'].str.split("/")
# A component is named after (at most) the first two segments, joined by ":".
log['component'] = path_segments.str[:2].str.join(":")
log.head()
Out[7]:
In [8]:
# Smallest 'age' value found within each component.
youngest_per_component = log.groupby('component')['age'].min()
# Order the components from the smallest to the largest of those values.
age_per_component = youngest_per_component.sort_values()
age_per_component.head()
Out[8]:
In [9]:
# The title promises years, but 'age' values are Timedeltas (a Timestamp minus
# a datetime column). Dividing by a one-year Timedelta turns them into plain
# floats measured in years, so the bar heights match the unit in the title.
age_in_years = age_per_component / pd.Timedelta(days=365.25)
age_in_years.plot.bar(
    title="Alter pro Komponente (in Jahren)",
    figsize=[15,5]);
In [10]:
# One group per (path, author) combination.
by_file_and_author = log.groupby(['path', 'author'])
# Per group: the earliest timestamp and the number of non-null 'line' entries.
knowledge = by_file_and_author.agg({'timestamp':'min', 'line':'count'})
knowledge.head()
Out[10]:
In [11]:
# Sum of 'line' over all rows of the same path, repeated on each of those rows
# (transform keeps the original row layout).
lines_per_file = knowledge.groupby('path')['line'].transform('sum')
knowledge['all_lines'] = lines_per_file
# Share of each row's 'line' value in the total for its path.
knowledge['knowing'] = knowledge['line'].div(knowledge['all_lines'])
knowledge.head()
Out[11]:
In [12]:
# Largest 'knowing' value within each path, aligned to every row of that path.
# The aggregation is given by its string alias: handing the Python builtin
# `max` to transform() is deprecated in recent pandas and emits a FutureWarning.
max_knowledge_per_file = knowledge.groupby(['path'])['knowing'].transform('max')
# Keep the rows that reach their path's maximum; when several rows tie for the
# maximum, all of them are kept.
knowledge_carriers = knowledge[knowledge['knowing'] == max_knowledge_per_file]
# Turn index level 1 (presumably 'author' -- verify against how `knowledge`
# was grouped) into a regular column.
knowledge_carriers = knowledge_carriers.reset_index(level=1)
knowledge_carriers.head()
Out[12]:
In [14]:
from ausi import d3

# Move the remaining index back into ordinary columns before handing the frame over.
carriers_flat = knowledge_carriers.reset_index()
# NOTE(review): the meaning of each positional argument is defined by the ausi
# helper and is not visible here; judging by the function name it produces JSON
# for a D3 zoomable circle-packing chart -- confirm against the ausi sources.
d3.create_json_for_zoomable_circle_packing(
    carriers_flat, 'author', 'author', 'path', '/',
    'all_lines', 'knowing', 'linux_circle_packing'
)